#coding=utf8
import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, and reload(sys) brings it back. Setting it to UTF-8
# makes implicit str <-> unicode conversions (e.g. printing unicode text to a
# plain file object) use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf8')

import jieba
import pickle
import requests
import jieba
import jieba.analyse
from scrapy import Selector
from model import SinaBlog

# Output files for the two passes of the report: the first is written before
# the custom IDF table is installed, the second after.
f_handler = open('result_origin.txt', 'w')
f2_handler = open('result_zhihu.txt', 'w')
# Send everything written with a bare `print` to the first report file.
sys.stdout = f_handler

# Load the topic vocabulary that get_theme() intersects extracted keywords with.
# The file is opened in a `with` block so the handle is closed as soon as the
# pickle has been read instead of being left to the garbage collector.
# NOTE(review): the open mode is kept at the original default (text); switch to
# 'rb' if the pickle was written in binary mode -- confirm against the writer.
# NOTE(security): pickle.load can execute arbitrary code; only load a trusted file.
with open('zhihu/topic_set.pkl') as topic_file:
	topic_set = pickle.load(topic_file)

def get_theme(text):
	"""Return a list of theme keywords for `text`.

	The keywords come from jieba.analyse.extract_tags(text). When at least
	three of them also appear in the module-level `topic_set`, only those
	shared keywords are returned; otherwise every extracted keyword is
	returned.
	"""
	keywords = set(jieba.analyse.extract_tags(text))
	overlap = keywords & topic_set
	if len(overlap) >= 3:
		return list(overlap)
	# Too few matches against the topic vocabulary: fall back to all keywords.
	return list(keywords)

def _write_report(out):
	"""Write one record per SinaBlog document to the file object `out`.

	Each record lists the blog's url, its title, the tags computed by
	get_theme() from the blog text ("machine") and the tags stored on the
	blog itself ("origin"), followed by a blank line.
	"""
	for blog in SinaBlog.objects:
		machine_tags = get_theme(blog.text)

		# `print >>out` writes to the given file explicitly, so the output
		# does not depend on what sys.stdout currently points at.
		print >>out, 'url:', blog.url
		print >>out, 'title:', blog.title
		print >>out, 'machine:' + ' '.join(machine_tags)
		print >>out, 'origin:' + ' '.join(blog.tags)
		print >>out, ''


try:
	# First pass: keyword extraction with whatever IDF table jieba has loaded.
	_write_report(f_handler)

	# Second pass: same report after installing the custom IDF table.
	jieba.analyse.set_idf_path('zhihu/idf-zhihu.txt.big')
	_write_report(f2_handler)
finally:
	# Put the real stdout back before closing the report files, so nothing is
	# left pointing at a closed file, then flush and release both handles.
	sys.stdout = sys.__stdout__
	f_handler.close()
	f2_handler.close()
